In [1]:
import numpy as np
import pandas as pd
from itertools import cycle
from scipy import interp

# sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.preprocessing import StandardScaler

# TensorFlow
import tensorflow as tf
from tensorflow.estimator import LinearClassifier

# Visualisation libraries

## IPython
from IPython.display import clear_output

## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex

## seaborn
import seaborn as sns
sns.set_context('paper', rc={'font.size':12,'axes.titlesize':14,'axes.labelsize':12})
sns.set_style('white')

## matplotlib
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
from pylab import rcParams
plt.style.use('seaborn-whitegrid')
plt.rcParams['figure.figsize'] = 14, 8
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline

## plotly
from plotly.offline import init_notebook_mode, iplot 
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
%config InlineBackend.figure_format = 'retina' 

import warnings
warnings.filterwarnings("ignore")
Tensorflow Boosted Trees Classifier

In this article, we demonstrate solving a classification problem in TensorFlow using Estimators using the Credit dataset. This dataset can be extracted from the ISLR package using the following syntax.

library (ISLR)
write.csv(Default, "Default.csv")

Dataset Information:

The Credit dataset contains the balance (average credit card debt for a number of individuals) as well as several quantitative predictors: age, cards (number of credit cards), education (years of education), income (in thousands of dollars), limit (credit limit), and rating (credit rating).

In [2]:
fig, ax = plt.subplots(1, 1, figsize= (16, 2))
_ = ax.text(0.36, 0.5, "Default", color = 'Indigo', size=60, rotation=0.,ha="center", va="center",
            bbox=dict(boxstyle="round4", ec='Indigo', fc='Plum'))
_ = ax.text(0.8, 0.8, "Dataset", size=60, color = 'Plum', rotation=0.,ha="right", va="top",
            bbox=dict(boxstyle="round4", ec='Indigo', fc='Indigo'))
_ = ax.axis('tight')
_ = ax.axis('off')

Data = pd.read_csv('Data/Default.csv', index_col = 0)
Data.columns = [x.title() for x in Data.columns]
display(Data.head(5))
Default Student Balance Income
1 No No 729.526495 44361.625074
2 No Yes 817.180407 12106.134700
3 No No 1073.549164 31767.138947
4 No No 529.250605 35704.493935
5 No No 785.655883 38463.495879
Attribute Description
Default A factor with levels No and Yes indicating whether the customer defaulted on their debt
Student A factor with levels No and Yes indicating whether the customer is a student
Balance The average balance that the customer has remaining on their credit card after making their monthly payment
Income Income of customer
In [3]:
Temp = Data.copy()
Temp['Student'] = pd.factorize(Temp['Student'])[0]
with sns.axes_style("ticks", {"xtick.major.size": 8, "ytick.major.size": 8}):
    g = sns.pairplot(Temp, corner=True, hue = 'Default', height = 2.5, palette="husl",
                     diag_kind="kde", markers="o",
                     plot_kws=dict(s=50, edgecolor="b", linewidth=1), diag_kws=dict(shade=True));
_ = g.axes[2,0].set_xticks([0, 1])    
_ = g.axes[2,0].set_xticklabels(['No', 'Yes'])
In [4]:
def Data_info(Inp, Only_NaN = False):
    Out = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
    Out = Out.join(Inp.isnull().sum().to_frame(name = 'Number of NaN Values'), how='outer')
    Out ['Size'] = Inp.shape[0]
    Out['Percentage'] = np.round(100*(Out['Number of NaN Values']/Inp.shape[0]),2)
    Out.index.name = 'Features'
    if Only_NaN:
        Out = Out.loc[Out['Number of NaN Values']>0]
    return Out

Data_info(Data)
Out[4]:
Data Type Number of NaN Values Size Percentage
Features
Balance float64 0 10000 0.0
Default object 0 10000 0.0
Income float64 0 10000 0.0
Student object 0 10000 0.0

Data Correlations

First off, the outcome variable here is Default. We have,

In [5]:
Target = 'Default'
Temp = pd.factorize(Data['Default'])
Data['Default'] = Temp[0]
Labels = Temp[1].tolist()
del Temp

Moreover, there is another categorical variable, Student.

In [6]:
Data['Student'] = pd.factorize(Data['Student'])[0]

Nowm, let's take a look at the variance of the features.

In [7]:
Fig, ax = plt.subplots(figsize=(4,3))
Temp = Data.drop(columns = [Target]).var().sort_values(ascending = False).to_frame(name= 'Variance').round(2).T
_ = sns.heatmap(Temp, ax=ax, annot=True, square=True,  cmap =sns.color_palette("OrRd", 20),
                  linewidths = 0.8, vmin=0, vmax=Temp.max(axis =1)[0],  annot_kws={"size": 12},
                  cbar_kws={'label': 'Feature Variance', "aspect":40, "shrink": .4, "orientation": "horizontal"})
lb = [x.replace(' ','\n').replace('\nof\n',' of\n') for x in [item.get_text() for item in ax.get_xticklabels()]]
_ = ax.set_xticklabels(lb)
_ = ax.set_yticklabels('')

Furthermore, we would like to standardize features by removing the mean and scaling to unit variance.

In [8]:
# Scaling
Temp = Data.drop(columns = Target).columns.tolist()
scaler = StandardScaler()
_ = scaler.fit(Data[Temp])
Data[Temp] = scaler.transform(Data[Temp])

Temp = Data.drop(columns = [Target]).var().sort_values(ascending = False).to_frame(name= 'Variance').round(2).T

# Variance Plot
Fig, ax = plt.subplots(figsize=(4,3))
_ = sns.heatmap(Temp, ax=ax, annot=True, square=True,  cmap =sns.color_palette('Greens'),
                  linewidths = 0.8, vmin=0, vmax=Temp.max(axis =1)[0], annot_kws={"size": 12},
                  cbar_kws={'label': 'Feature Variance', "aspect":40, "shrink": .4, "orientation": "horizontal"})
lb = [x.replace(' ','\n').replace('\nof\n',' of\n') for x in [item.get_text() for item in ax.get_xticklabels()]]
_ = ax.set_xticklabels(lb)
_ = ax.set_yticklabels('')

Train and Test sets

In [9]:
X = Data.copy()
y = Data.pop(Target)
Test_Size = 0.3

def Sets_Plot(Data, Test_Size):
    Temp = pd.DataFrame({'Set': ['Train', 'Test'],
             'Number of Instances':[int(Data.shape[0]*(1-Test_Size)), int(Data.shape[0]*Test_Size)]})
    Temp['Percentage'] = np.round(100* Temp['Number of Instances'].values /Temp['Number of Instances'].sum(), 2)
    fig = px.bar(Temp, y= ['',''], x= 'Number of Instances', orientation='h', color = 'Set', text = 'Percentage',
                 color_discrete_sequence = ['PaleGreen', 'LightBlue'], height = 180)
    fig.update_layout(plot_bgcolor= 'white', legend_orientation='h', legend=dict(x=0, y=1.7),
                      xaxis = dict(tickmode = 'array', tickvals = [0, Data.shape[0]], ticktext = ['','']))
    fig.update_traces(marker_line_color= 'Black', marker_line_width=1.5, opacity=1)
    fig.update_traces(texttemplate='%{text:.2}%      ', textposition='inside')
    fig.update_xaxes(title_text=None, range=[0, Data.shape[0]])
    fig.update_yaxes(title_text=None)
    fig.show()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= Test_Size, random_state=42)
display(pd.DataFrame(data={'Set':['X_train','X_test','y_train','y_test'],
                           'Shape':[X_train.shape, X_test.shape, y_train.shape, y_test.shape]}).set_index('Set').T)
Sets_Plot(Data, Test_Size)
Set X_train X_test y_train y_test
Shape (7000, 4) (3000, 4) (7000,) (3000,)

Input Function

The input function specifies how data is converted to a tf.data.Dataset that feeds the input pipeline in a streaming fashion. Moreover, an input function is a function that returns a tf.data.Dataset object which outputs the following two-element tuple:

  • features - A Python dictionary in which:
    • Each key is the name of a feature.
    • Each value is an array containing all of that feature's values.
  • label - An array containing the values of the label for every example.
In [10]:
def input_fn(features, labels, training=True, batch_size=256):
    """An input function for training or evaluating"""
    # Convert the inputs to a Dataset.
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))

    # Shuffle and repeat if you are in training mode.
    if training:
        dataset = dataset.shuffle(1000).repeat()
    
    return dataset.batch(batch_size)

Moreover, an estimator model consists of two main parts, feature columns, and a numeric vector. Feature columns provide explanations for the input numeric vector. The following function separates categorical and numerical columns (features)and returns a descriptive list of feature columns.

In [11]:
def Feat_Columns(Inp):
    Temp = Inp.dtypes.reset_index(drop = False)
    Temp.columns = ['Features', 'Data Type']
    Temp['Data Type'] = Temp['Data Type'].astype(str)
    # Numeric_Columns
    Numeric_Columns = Temp.loc[Temp['Data Type'].isin(['int64', 'int32', 'float64', 'float32']),'Features'].tolist()
    # Categorical_Columns
    Categorical_Columns = Temp.loc[Temp['Data Type'] == 'object','Features'].tolist()
        
    # Feature Columns
    feature_columns = []
    if len(Categorical_Columns)>0:
        for feature_name in Categorical_Columns:
          vocabulary = Inp[feature_name].unique()
          feature_columns.append(tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocabulary))
    if len(Numeric_Columns)>0:
        for feature_name in Numeric_Columns:
          feature_columns.append(tf.feature_column.numeric_column(feature_name))
    return feature_columns

my_feature_columns = Feat_Columns(X)

Boosted Trees Classifier

In [12]:
tf.keras.backend.clear_session()
IT = int(5e3)
# Classifier
classifier = tf.estimator.BoostedTreesClassifier(feature_columns=my_feature_columns,
                                                 n_batches_per_layer= 1,
                                                 n_classes= len(Labels),
                                                 learning_rate=0.1)
# Training
classifier.train(input_fn=lambda: input_fn(X_train, y_train, training=True), max_steps = IT)
result = classifier.evaluate(input_fn=lambda: input_fn(X_test, y_test, training=False))
clear_output()
display(pd.DataFrame(result, index = ['']).round(4))
accuracy accuracy_baseline auc auc_precision_recall average_loss label/mean loss precision prediction/mean recall global_step
1.0 0.9687 1.0 1.0 0.0 0.0313 0.0 1.0 0.0313 1.0 600

Predictions

In [13]:
pred_dicts = list(classifier.predict(input_fn=lambda: input_fn(X_test, y_test, training=False)))
clear_output()
probs = np.array([pred['probabilities'] for pred in pred_dicts])

ROC Curves

In [14]:
def ROC_Curve(y_test, probs, n_classes, FS = 7, ax = False, pad = 0.01):
    # converting y_test to categorical
    y_test_cat = tf.keras.utils.to_categorical(y_test, num_classes=n_classes, dtype='float32')
    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = metrics.roc_curve(y_test_cat[:, i], probs[:, i])
        roc_auc[i] = metrics.auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = metrics.roc_curve(y_test_cat.ravel(), probs.ravel())
    roc_auc["micro"] = metrics.auc(fpr["micro"], tpr["micro"])

    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at this points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = metrics.auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    if ax == False:
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(FS, FS))
        
    _ = ax.plot(fpr["micro"], tpr["micro"], label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]),
                color='deeppink', linestyle=':', linewidth=4)
    _ = ax.plot(fpr["macro"], tpr["macro"], label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]),
                color='navy', linestyle=':', linewidth=4)
    colors = cycle(['Aqua', 'DarkOrange', 'CornflowerBlue'])

    for i, color in zip(range(n_classes), colors):
        _ = ax.plot(fpr[i], tpr[i], color=color, lw=2, label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))

    _ = ax.plot([0, 1], [0, 1], 'k--', lw=2)
    _ = ax.set_xlabel('False Positive Rate (FPR)')
    _ = ax.set_ylabel('True Positive Rate (TPR)')
    _ = ax.set_title('Receiver Operating Characteristic (ROC) Curves')
    _ = ax.legend(loc="lower right", fontsize = 12)
    _ = ax.set_xlim([-pad,1+pad])
    _ = ax.set_ylim([-pad,1+pad])
    # end
    
ROC_Curve(y_test, probs, n_classes = len(Labels), FS = 7)

Confusion Matrix

In [15]:
# Test set
y_pred = np.argmax(probs, axis = 1).reshape(-1,1)
Confusion_Matrix = metrics.confusion_matrix(y_test, y_pred)

Results = pd.DataFrame(metrics.classification_report(y_test, y_pred, target_names=Labels, output_dict=True)).T
display(Results.round(2))

fig, ax = plt.subplots(1, 2, figsize=(12, 4))
fig.suptitle('Test Set', fontsize = 18)
_ = sns.heatmap(Confusion_Matrix, annot=True, annot_kws={"size": 14}, cmap="Blues", ax = ax[0],
                linewidths = 0.2, cbar_kws={"shrink": 1})
_ = ax[0].set_xlabel('Predicted labels')
_ = ax[0].set_ylabel('True labels'); 
_ = ax[0].set_title('Confusion Matrix');
_ = ax[0].xaxis.set_ticklabels(Labels)
_ = ax[0].yaxis.set_ticklabels(Labels)

Confusion_Matrix = Confusion_Matrix.astype('float') / Confusion_Matrix.sum(axis=1)[:, np.newaxis]
_ = sns.heatmap(Confusion_Matrix, annot=True, annot_kws={"size": 14}, cmap="Greens", ax = ax[1],
               linewidths = 0.2, vmin=0, vmax=1, cbar_kws={"shrink": 1})
_ = ax[1].set_xlabel('Predicted labels')
_ = ax[1].set_ylabel('True labels'); 
_ = ax[1].set_title('Normalized Confusion Matrix');
_ = ax[1].xaxis.set_ticklabels(Labels)
_ = ax[1].yaxis.set_ticklabels(Labels)
precision recall f1-score support
No 1.0 1.0 1.0 2906.0
Yes 1.0 1.0 1.0 94.0
accuracy 1.0 1.0 1.0 1.0
macro avg 1.0 1.0 1.0 3000.0
weighted avg 1.0 1.0 1.0 3000.0

Boosted Trees Classifier with $l_1$ regularization (Lasso)

Lasso (least absolute shrinkage and selection operator) classifier was introduced within the context of the method of least squares. Lasso) alters the model fitting process to pick only a subset of the provided covariates to be used within the final model instead of using all of them and this will improve the prediction accuracy and interpretability of regression models.

In [16]:
tf.keras.backend.clear_session()
IT = int(5e3)
# Classifier
classifier = tf.estimator.BoostedTreesClassifier(feature_columns=my_feature_columns,
                                                 n_batches_per_layer= 1,
                                                 n_classes= len(Labels),
                                                 n_trees=120,
                                                 max_depth=5,
                                                 learning_rate=0.1,
                                                 l1_regularization= 1e-3)
# Training
classifier.train(input_fn=lambda: input_fn(X_train, y_train, training=True), max_steps = IT)
result = classifier.evaluate(input_fn=lambda: input_fn(X_test, y_test, training=False))
clear_output()
display(pd.DataFrame(result, index = ['']).round(4))
accuracy accuracy_baseline auc auc_precision_recall average_loss label/mean loss precision prediction/mean recall global_step
1.0 0.9687 1.0 1.0 0.0015 0.0313 0.0015 1.0 0.0317 1.0 600

Predictions

In [17]:
pred_dicts = list(classifier.predict(input_fn=lambda: input_fn(X_test, y_test, training=False)))
clear_output()
probs = np.array([pred['probabilities'] for pred in pred_dicts])

ROC Curves

In [18]:
ROC_Curve(y_test, probs, n_classes = len(Labels), FS = 7)

Confusion Matrix

In [19]:
# Test set
y_pred = np.argmax(probs, axis = 1).reshape(-1,1)
Confusion_Matrix = metrics.confusion_matrix(y_test, y_pred)

Results = pd.DataFrame(metrics.classification_report(y_test, y_pred, target_names=Labels, output_dict=True)).T
display(Results.round(2))

fig, ax = plt.subplots(1, 2, figsize=(12, 4))
fig.suptitle('Test Set', fontsize = 18)
_ = sns.heatmap(Confusion_Matrix, annot=True, annot_kws={"size": 14}, cmap="Blues", ax = ax[0],
                linewidths = 0.2, cbar_kws={"shrink": 1})
_ = ax[0].set_xlabel('Predicted labels')
_ = ax[0].set_ylabel('True labels'); 
_ = ax[0].set_title('Confusion Matrix');
_ = ax[0].xaxis.set_ticklabels(Labels)
_ = ax[0].yaxis.set_ticklabels(Labels)

Confusion_Matrix = Confusion_Matrix.astype('float') / Confusion_Matrix.sum(axis=1)[:, np.newaxis]
_ = sns.heatmap(Confusion_Matrix, annot=True, annot_kws={"size": 14}, cmap="Greens", ax = ax[1],
               linewidths = 0.2, vmin=0, vmax=1, cbar_kws={"shrink": 1})
_ = ax[1].set_xlabel('Predicted labels')
_ = ax[1].set_ylabel('True labels'); 
_ = ax[1].set_title('Normalized Confusion Matrix');
_ = ax[1].xaxis.set_ticklabels(Labels)
_ = ax[1].yaxis.set_ticklabels(Labels)
precision recall f1-score support
No 1.0 1.0 1.0 2906.0
Yes 1.0 1.0 1.0 94.0
accuracy 1.0 1.0 1.0 1.0
macro avg 1.0 1.0 1.0 3000.0
weighted avg 1.0 1.0 1.0 3000.0

Boosted Trees Classifier with $l_2$ regularization (Ridge)

In [20]:
tf.keras.backend.clear_session()
IT = int(5e3)
# Classifier
classifier = tf.estimator.BoostedTreesClassifier(feature_columns=my_feature_columns,
                                                 n_batches_per_layer= 1,
                                                 n_classes= len(Labels),
                                                 n_trees=120,
                                                 max_depth=5,
                                                 learning_rate=0.1,
                                                 l2_regularization= 1e-3)
# Training
classifier.train(input_fn=lambda: input_fn(X_train, y_train, training=True), max_steps = IT)
result = classifier.evaluate(input_fn=lambda: input_fn(X_test, y_test, training=False))
clear_output()
display(pd.DataFrame(result, index = ['']).round(4))
accuracy accuracy_baseline auc auc_precision_recall average_loss label/mean loss precision prediction/mean recall global_step
1.0 0.9687 1.0 1.0 0.0 0.0313 0.0 1.0 0.0313 1.0 600

Predictions

In [21]:
pred_dicts = list(classifier.predict(input_fn=lambda: input_fn(X_test, y_test, training=False)))
clear_output()
probs = np.array([pred['probabilities'] for pred in pred_dicts])

ROC Curves

In [22]:
ROC_Curve(y_test, probs, n_classes = len(Labels), FS = 7)

Confusion Matrix

In [23]:
# Test set
y_pred = np.argmax(probs, axis = 1).reshape(-1,1)
Confusion_Matrix = metrics.confusion_matrix(y_test, y_pred)

Results = pd.DataFrame(metrics.classification_report(y_test, y_pred, target_names=Labels, output_dict=True)).T
display(Results.round(2))

fig, ax = plt.subplots(1, 2, figsize=(12, 4))
fig.suptitle('Test Set', fontsize = 18)
_ = sns.heatmap(Confusion_Matrix, annot=True, annot_kws={"size": 14}, cmap="Blues", ax = ax[0],
                linewidths = 0.2, cbar_kws={"shrink": 1})
_ = ax[0].set_xlabel('Predicted labels')
_ = ax[0].set_ylabel('True labels'); 
_ = ax[0].set_title('Confusion Matrix');
_ = ax[0].xaxis.set_ticklabels(Labels)
_ = ax[0].yaxis.set_ticklabels(Labels)

Confusion_Matrix = Confusion_Matrix.astype('float') / Confusion_Matrix.sum(axis=1)[:, np.newaxis]
_ = sns.heatmap(Confusion_Matrix, annot=True, annot_kws={"size": 14}, cmap="Greens", ax = ax[1],
               linewidths = 0.2, vmin=0, vmax=1, cbar_kws={"shrink": 1})
_ = ax[1].set_xlabel('Predicted labels')
_ = ax[1].set_ylabel('True labels'); 
_ = ax[1].set_title('Normalized Confusion Matrix');
_ = ax[1].xaxis.set_ticklabels(Labels)
_ = ax[1].yaxis.set_ticklabels(Labels)
precision recall f1-score support
No 1.0 1.0 1.0 2906.0
Yes 1.0 1.0 1.0 94.0
accuracy 1.0 1.0 1.0 1.0
macro avg 1.0 1.0 1.0 3000.0
weighted avg 1.0 1.0 1.0 3000.0

References

  1. Regression analysis. Wikipedia. Last edited on 17 April 2020, at 13:31 (UTC). https://en.wikipedia.org/wiki/Regression_analysis
  2. Tensorflow tutorials, https://www.tensorflow.org/tutorials
  3. TensorFlow Boosted Trees Classifier, https://www.tensorflow.org/api_docs/python/tf/estimator/BoostedTreesClassifier?version=nightly
  4. Lasso (statistics), https://en.wikipedia.org/wiki/Lasso_(statistics).
  5. Tikhonov regularizationm https://en.wikipedia.org/wiki/Tikhonov_regularization.
  6. James, G., Witten, D., Hastie, T., & Tibshirani, R. (2013). An introduction to statistical learning (Vol. 112, pp. 3-7). New York: springer.
  7. Jordi Warmenhoven, ISLR-python
  8. James, G., Witten, D., Hastie, T., & Tibshirani, R. (2017). ISLR: Data for an Introduction to Statistical Learning with Applications in R